Importing of Packages--
import pandas as pd
import numpy as np
import math as m
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
import plotly.express as px
Reading the first data file--
def read_df1(path="G:/Makaan_Properties_Details.csv"):
    """Read the Makaan property-details CSV into a DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the original hard-coded path
        so existing callers are unaffected.

    Returns
    -------
    pandas.DataFrame
    """
    # latin1: the file contains non-UTF-8 bytes, so a plain read_csv would fail.
    df1 = pd.read_csv(path, encoding='latin1')
    return df1
# Load the property-details file and peek at its first rows and columns.
print("Calling the read_data function--")
df1=read_df1()
print(df1.head(2))
df1.columns
Calling the read_data function--
Property_Name Property_id Property_type Property_status \
0 Arkiton Luxe 15446514 Apartment Under Construction
1 Keshav Akshar Ocean Pearl 15367414 Apartment Under Construction
Price_per_unit_area Posted_On \
0 4,285 1 day ago
1 7,000 2 days ago
Project_URL builder_id \
0 https://www.makaan.com/ahmedabad/arkiton-life-... 100563465.0
1 https://www.makaan.com/ahmedabad/keshav-naraya... 100009433.0
Builder_name Property_building_status ... is_furnished \
0 Arkiton life Space ACTIVE ... Unfurnished
1 Keshav Narayan Group ACTIVE ... Unfurnished
listing_domain_score is_plot is_RERA_registered is_Apartment \
0 4.0 False True True
1 4.0 False True True
is_ready_to_move is_commercial_Listing is_PentaHouse is_studio \
0 False False False False
1 False False False False
Listing_Category
0 sell
1 sell
[2 rows x 24 columns]
Index(['Property_Name', 'Property_id', 'Property_type', 'Property_status',
'Price_per_unit_area', 'Posted_On', 'Project_URL', 'builder_id',
'Builder_name', 'Property_building_status', 'No_of_BHK', 'Price',
'Size', 'description', 'is_furnished', 'listing_domain_score',
'is_plot', 'is_RERA_registered', 'is_Apartment', 'is_ready_to_move',
'is_commercial_Listing', 'is_PentaHouse', 'is_studio',
'Listing_Category'],
dtype='object')
Reading the second data file--
def read_df2(path="G:/Makaan_property_location_details.csv"):
    """Read the Makaan property-location CSV into a DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the original hard-coded path
        so existing callers are unaffected.

    Returns
    -------
    pandas.DataFrame
    """
    df2 = pd.read_csv(path)
    return df2
# Load the location-details file and peek at its first rows and columns.
print("Calling the read_data function--")
df2=read_df2()
print(df2.head(2))
df2.columns
Calling the read_data function--
Property_id City_id City_name Locality_ID Locality_Name Longitude \
0 15579866 1 Ahmedabad 51749 Bodakdev 72.520195
1 15579809 1 Ahmedabad 51749 Bodakdev 72.502571
Latitude Sub_urban_ID Sub_urban_name
0 23.040195 10003 SG Highway
1 23.032154 10003 SG Highway
Index(['Property_id', 'City_id', 'City_name', 'Locality_ID', 'Locality_Name',
'Longitude', 'Latitude', 'Sub_urban_ID', 'Sub_urban_name'],
dtype='object')
Performing inner join to merge two data files--
# Inner join keeps only properties present in both files. Both frames use the
# same key name, so `on=` is clearer than identical left_on/right_on.
data = df1.merge(df2, on='Property_id', how='inner')
# Canonical option key is 'display.max_columns'; show all columns of the wide frame.
pd.set_option("display.max_columns", None)
data.head(2)
| Property_Name | Property_id | Property_type | Property_status | Price_per_unit_area | Posted_On | Project_URL | builder_id | Builder_name | Property_building_status | No_of_BHK | Price | Size | description | is_furnished | listing_domain_score | is_plot | is_RERA_registered | is_Apartment | is_ready_to_move | is_commercial_Listing | is_PentaHouse | is_studio | Listing_Category | City_id | City_name | Locality_ID | Locality_Name | Longitude | Latitude | Sub_urban_ID | Sub_urban_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Arkiton Luxe | 15446514 | Apartment | Under Construction | 4,285 | 1 day ago | https://www.makaan.com/ahmedabad/arkiton-life-... | 100563465.0 | Arkiton life Space | ACTIVE | 3 BHK | 75,00,000 | 1,750 sq ft | The house is unfurnished. It has car parking. ... | Unfurnished | 4.0 | False | True | True | False | False | False | False | sell | 1 | Ahmedabad | 51907 | Bopal | 72.465103 | 23.037386 | 10229 | Ahmedabad West |
| 1 | Arkiton Luxe | 15446514 | Apartment | Under Construction | 4,285 | 1 day ago | https://www.makaan.com/ahmedabad/arkiton-life-... | 100563465.0 | Arkiton life Space | ACTIVE | 3 BHK | 75,00,000 | 1,750 sq ft | The house is unfurnished. It has car parking. ... | Unfurnished | 4.0 | False | True | True | False | False | False | False | sell | 1 | Ahmedabad | 51907 | Bopal | 72.465103 | 23.037386 | 10229 | Ahmedabad West |
Print basic info about data-
# Quick structural overview: columns, shape, dtypes and memory footprint.
print(data.columns)
print("-------------------------------------------")
print("Rows,Columns--", data.shape)
print("-------------------------------------------")
# DataFrame.info() prints directly and returns None — wrapping it in print()
# emitted a stray "None" line in the original output.
data.info()
Index(['Property_Name', 'Property_id', 'Property_type', 'Property_status',
'Price_per_unit_area', 'Posted_On', 'Project_URL', 'builder_id',
'Builder_name', 'Property_building_status', 'No_of_BHK', 'Price',
'Size', 'description', 'is_furnished', 'listing_domain_score',
'is_plot', 'is_RERA_registered', 'is_Apartment', 'is_ready_to_move',
'is_commercial_Listing', 'is_PentaHouse', 'is_studio',
'Listing_Category', 'City_id', 'City_name', 'Locality_ID',
'Locality_Name', 'Longitude', 'Latitude', 'Sub_urban_ID',
'Sub_urban_name'],
dtype='object')
-------------------------------------------
Rows,Columns-- (4942704, 32)
-------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4942704 entries, 0 to 4942703
Data columns (total 32 columns):
# Column Dtype
--- ------ -----
0 Property_Name object
1 Property_id int64
2 Property_type object
3 Property_status object
4 Price_per_unit_area object
5 Posted_On object
6 Project_URL object
7 builder_id float64
8 Builder_name object
9 Property_building_status object
10 No_of_BHK object
11 Price object
12 Size object
13 description object
14 is_furnished object
15 listing_domain_score float64
16 is_plot bool
17 is_RERA_registered bool
18 is_Apartment bool
19 is_ready_to_move bool
20 is_commercial_Listing bool
21 is_PentaHouse bool
22 is_studio bool
23 Listing_Category object
24 City_id int64
25 City_name object
26 Locality_ID int64
27 Locality_Name object
28 Longitude float64
29 Latitude float64
30 Sub_urban_ID int64
31 Sub_urban_name object
dtypes: bool(7), float64(4), int64(4), object(17)
memory usage: 1013.5+ MB
None
Pre-processing of data--
Checking for missing values--
# Count missing values per column to plan imputation and drops.
data.isnull().sum()
Property_Name 1715856 Property_id 0 Property_type 0 Property_status 2895441 Price_per_unit_area 0 Posted_On 0 Project_URL 0 builder_id 4205920 Builder_name 4205920 Property_building_status 0 No_of_BHK 0 Price 0 Size 0 description 1 is_furnished 0 listing_domain_score 0 is_plot 0 is_RERA_registered 0 is_Apartment 0 is_ready_to_move 0 is_commercial_Listing 0 is_PentaHouse 0 is_studio 0 Listing_Category 0 City_id 0 City_name 0 Locality_ID 0 Locality_Name 2 Longitude 0 Latitude 0 Sub_urban_ID 0 Sub_urban_name 0 dtype: int64
1. Checking for and removing special characters. 2. Data type conversion.
# Inspect raw values: comma-grouped strings, not numbers.
data['Price_per_unit_area'].unique()
array(['4,285', '3,600', '7,000', ..., '11,763', '39,464', '38,910'],
dtype=object)
# Price uses Indian-style comma grouping (e.g. '75,00,000').
data['Price'].unique()
array(['75,00,000', '63,00,000', '2,36,88,000', ..., '26,49,999',
'26,98,434', '35,29,577'], dtype=object)
# Size carries a 'sq ft' unit suffix on every value.
data['Size'].unique()
array(['1,750 sq ft', '3,384 sq ft', '2,295 sq ft', ..., '4,556 sq ft',
'18,837 sq ft', '226 sq ft'], dtype=object)
# Strip thousands separators and unit suffixes, then cast to int.
# The replacements are literal substrings, so use the .str accessor with
# regex=False instead of Series.replace(regex=True).
data['Price_per_unit_area'] = data['Price_per_unit_area'].str.replace(',', '', regex=False).astype(int)
data['Price'] = data['Price'].str.replace(',', '', regex=False).astype(int)
data['Size'] = (data['Size']
                .str.replace('sq ft', '', regex=False)
                .str.replace(',', '', regex=False)
                .str.strip()          # drop whitespace left where the unit was removed
                .astype(int))
1.Columns 'Price_per_unit_area','Price' have object datatype we are changing it to int type and also removing the comma.
# Inspect the BHK/RK categories before cleaning.
data['No_of_BHK'].unique()
array(['3 BHK', '4 BHK', '2 BHK', '5 BHK', '1 BHK', '1 RK', '0 BHK',
'12 BHK', '7 BHK', '6 BHK', '8 BHK', '10 BHK', '11 BHK', '9 BHK',
'15 BHK', '3 RK', '14 BHK', '2 RK'], dtype=object)
# Count how many RK (room-kitchen) listings exist relative to the full data.
print(len(data[data['No_of_BHK']=='1 RK']))
print(len(data[data['No_of_BHK']=='2 RK']))
print(len(data[data['No_of_BHK']=='3 RK']))
print(data.shape)
7271 2 4 (4942704, 32)
As we can see, there are a few RK listings in the BHK column. Relative to the full dataset they number only a few thousand, so let's drop these RK rows.
# The RK listings are only a few thousand rows — remove them so the column
# holds BHK counts exclusively, then confirm the remaining categories.
rk_index = data[data['No_of_BHK'].isin(['1 RK', '2 RK', '3 RK'])].index
data.drop(rk_index, inplace=True)
data['No_of_BHK'].unique()
array(['3 BHK', '4 BHK', '2 BHK', '5 BHK', '1 BHK', '0 BHK', '12 BHK',
'7 BHK', '6 BHK', '8 BHK', '10 BHK', '11 BHK', '9 BHK', '15 BHK',
'14 BHK'], dtype=object)
We have "0 BHK" lets analyse them-
# Inspect the '0 BHK' rows — they turn out to be Residential Plots.
data[data['No_of_BHK']=='0 BHK'].head(2)
| Property_Name | Property_id | Property_type | Property_status | Price_per_unit_area | Posted_On | Project_URL | builder_id | Builder_name | Property_building_status | No_of_BHK | Price | Size | description | is_furnished | listing_domain_score | is_plot | is_RERA_registered | is_Apartment | is_ready_to_move | is_commercial_Listing | is_PentaHouse | is_studio | Listing_Category | City_id | City_name | Locality_ID | Locality_Name | Longitude | Latitude | Sub_urban_ID | Sub_urban_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1191 | NaN | 15528030 | Residential Plot | Ready to move | 13650 | 9 days ago | https://www.makaan.com/ahmedabad/builder-proje... | NaN | NaN | UNVERIFIED | 0 BHK | 43000000 | 3150 | A plot is available at a prime location in Juh... | Unfurnished | 4.0 | True | False | False | True | False | False | False | sell | 1 | Ahmedabad | 52604 | Juhapura | 72.523506 | 22.996189 | 10229 | Ahmedabad West |
| 1192 | NaN | 15528240 | Residential Plot | Ready to move | 518 | 9 days ago | https://www.makaan.com/ahmedabad/builder-proje... | NaN | NaN | UNVERIFIED | 0 BHK | 700000 | 1350 | A plot is available at a prime location in Gor... | Unfurnished | 4.0 | True | False | False | True | False | False | False | sell | 1 | Ahmedabad | 105460 | Goraj | 72.320763 | 22.978149 | 10094 | Other |
As we can see this '0 BHKs' are Residential Plots. Cleaning BHK column-
# Strip the literal 'BHK' suffix (regex not needed) and cast to int.
# '0 BHK' rows (plots) become 0.
data['No_of_BHK'] = data['No_of_BHK'].str.replace('BHK', '', regex=False).str.strip().astype(int)
data.columns
data.head(2)
| Property_Name | Property_id | Property_type | Property_status | Price_per_unit_area | Posted_On | Project_URL | builder_id | Builder_name | Property_building_status | No_of_BHK | Price | Size | description | is_furnished | listing_domain_score | is_plot | is_RERA_registered | is_Apartment | is_ready_to_move | is_commercial_Listing | is_PentaHouse | is_studio | Listing_Category | City_id | City_name | Locality_ID | Locality_Name | Longitude | Latitude | Sub_urban_ID | Sub_urban_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Arkiton Luxe | 15446514 | Apartment | Under Construction | 4285 | 1 day ago | https://www.makaan.com/ahmedabad/arkiton-life-... | 100563465.0 | Arkiton life Space | ACTIVE | 3 | 7500000 | 1750 | The house is unfurnished. It has car parking. ... | Unfurnished | 4.0 | False | True | True | False | False | False | False | sell | 1 | Ahmedabad | 51907 | Bopal | 72.465103 | 23.037386 | 10229 | Ahmedabad West |
| 1 | Arkiton Luxe | 15446514 | Apartment | Under Construction | 4285 | 1 day ago | https://www.makaan.com/ahmedabad/arkiton-life-... | 100563465.0 | Arkiton life Space | ACTIVE | 3 | 7500000 | 1750 | The house is unfurnished. It has car parking. ... | Unfurnished | 4.0 | False | True | True | False | False | False | False | sell | 1 | Ahmedabad | 51907 | Bopal | 72.465103 | 23.037386 | 10229 | Ahmedabad West |
# Single category only ('sell') — the column carries no information.
data["Listing_Category"].unique()
array(['sell'], dtype=object)
Dropping irrelevant columns--
# Drop identifiers, free text, coordinates and location codes that cannot
# drive Price (the *_name columns are kept instead of the id columns).
data.drop(columns=['Property_id','Posted_On', 'Project_URL','builder_id','Builder_name','description',
'listing_domain_score','Listing_Category', 'City_id','Locality_ID','Longitude', 'Latitude','Sub_urban_ID'],inplace=True)
data.head(2)
| Property_Name | Property_type | Property_status | Price_per_unit_area | Property_building_status | No_of_BHK | Price | Size | is_furnished | is_plot | is_RERA_registered | is_Apartment | is_ready_to_move | is_commercial_Listing | is_PentaHouse | is_studio | City_name | Locality_Name | Sub_urban_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Arkiton Luxe | Apartment | Under Construction | 4285 | ACTIVE | 3 | 7500000 | 1750 | Unfurnished | False | True | True | False | False | False | False | Ahmedabad | Bopal | Ahmedabad West |
| 1 | Arkiton Luxe | Apartment | Under Construction | 4285 | ACTIVE | 3 | 7500000 | 1750 | Unfurnished | False | True | True | False | False | False | False | Ahmedabad | Bopal | Ahmedabad West |
1. 'Property_id', 'Posted_On', 'Project_URL', 'builder_id', 'Builder_name', 'description' and 'listing_domain_score' have
no effect on the price of a property, hence we drop these columns.
2. The "Listing_Category" column shows that every property is for sale, so it does not represent any distinct attribute; hence we drop it.
3. We keep City_name, Locality_Name and Sub_urban_name; 'City_id', 'Locality_ID' and 'Sub_urban_ID' convey the same information, hence we drop those as well.
4. We already have the city name, so keeping the 'Longitude' and 'Latitude' columns adds nothing; we therefore drop them too.
# Confirm the remaining columns after the drops.
data.columns
Index(['Property_Name', 'Property_type', 'Property_status',
'Price_per_unit_area', 'Property_building_status', 'No_of_BHK', 'Price',
'Size', 'is_furnished', 'is_plot', 'is_RERA_registered', 'is_Apartment',
'is_ready_to_move', 'is_commercial_Listing', 'is_PentaHouse',
'is_studio', 'City_name', 'Locality_Name', 'Sub_urban_name'],
dtype='object')
Dropping some more columns--
# Rows where is_plot is True are exactly the Residential Plot listings.
data[data['is_plot']==True].head(2)
| Property_Name | Property_type | Property_status | Price_per_unit_area | Property_building_status | No_of_BHK | Price | Size | is_furnished | is_plot | is_RERA_registered | is_Apartment | is_ready_to_move | is_commercial_Listing | is_PentaHouse | is_studio | City_name | Locality_Name | Sub_urban_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1191 | NaN | Residential Plot | Ready to move | 13650 | UNVERIFIED | 0 | 43000000 | 3150 | Unfurnished | True | False | False | True | False | False | False | Ahmedabad | Juhapura | Ahmedabad West |
| 1192 | NaN | Residential Plot | Ready to move | 518 | UNVERIFIED | 0 | 700000 | 1350 | Unfurnished | True | False | False | True | False | False | False | Ahmedabad | Goraj | Other |
# Non-plot rows cover every other property type.
data[data['is_plot']==False].head(2)
| Property_Name | Property_type | Property_status | Price_per_unit_area | Property_building_status | No_of_BHK | Price | Size | is_furnished | is_plot | is_RERA_registered | is_Apartment | is_ready_to_move | is_commercial_Listing | is_PentaHouse | is_studio | City_name | Locality_Name | Sub_urban_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Arkiton Luxe | Apartment | Under Construction | 4285 | ACTIVE | 3 | 7500000 | 1750 | Unfurnished | False | True | True | False | False | False | False | Ahmedabad | Bopal | Ahmedabad West |
| 1 | Arkiton Luxe | Apartment | Under Construction | 4285 | ACTIVE | 3 | 7500000 | 1750 | Unfurnished | False | True | True | False | False | False | False | Ahmedabad | Bopal | Ahmedabad West |
# is_plot duplicates Property_type == 'Residential Plot' — drop it.
data.drop(['is_plot'],axis=1,inplace=True)
We can observe that if property is Residential Plot then 'is_plot' column display True as outcome.which simply means 'is_plot' is revealing one of Property_type only .hence,we are dropping this column.
# Check the flag's value set.
data.is_Apartment.unique()
array([ True, False])
# True rows are exactly the Apartment listings.
data[data['is_Apartment']==True].head(2)
| Property_Name | Property_type | Property_status | Price_per_unit_area | Property_building_status | No_of_BHK | Price | Size | is_furnished | is_RERA_registered | is_Apartment | is_ready_to_move | is_commercial_Listing | is_PentaHouse | is_studio | City_name | Locality_Name | Sub_urban_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Arkiton Luxe | Apartment | Under Construction | 4285 | ACTIVE | 3 | 7500000 | 1750 | Unfurnished | True | True | False | False | False | False | Ahmedabad | Bopal | Ahmedabad West |
| 1 | Arkiton Luxe | Apartment | Under Construction | 4285 | ACTIVE | 3 | 7500000 | 1750 | Unfurnished | True | True | False | False | False | False | Ahmedabad | Bopal | Ahmedabad West |
# False rows are the non-Apartment property types.
data[data['is_Apartment']==False].head(2)
| Property_Name | Property_type | Property_status | Price_per_unit_area | Property_building_status | No_of_BHK | Price | Size | is_furnished | is_RERA_registered | is_Apartment | is_ready_to_move | is_commercial_Listing | is_PentaHouse | is_studio | City_name | Locality_Name | Sub_urban_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 205 | Avirat Silver Luxuria | Independent House | Under Construction | 4300 | ACTIVE | 4 | 17845000 | 4150 | Unfurnished | True | False | False | False | False | False | Ahmedabad | Gota | Ahmedabad North |
| 206 | Avirat Silver Luxuria | Independent House | Under Construction | 4300 | ACTIVE | 4 | 17845000 | 4150 | Unfurnished | True | False | False | False | False | False | Ahmedabad | Gota | Ahmedabad North |
We can observe that if property is Apartment then 'is_Apartment' column display True as outcome.which simply means 'is_Apartment' is revealing one of Property_type only.
hence,we are dropping this column.
# is_Apartment duplicates Property_type == 'Apartment' — drop it.
data.drop(['is_Apartment'],axis=1,inplace=True)
# Check the flag's value set.
data['is_ready_to_move'].unique()
array([False, True])
# False here corresponds to 'Under Construction' in Property_status.
data[data['is_ready_to_move']==False].head(2)
| Property_Name | Property_type | Property_status | Price_per_unit_area | Property_building_status | No_of_BHK | Price | Size | is_furnished | is_RERA_registered | is_ready_to_move | is_commercial_Listing | is_PentaHouse | is_studio | City_name | Locality_Name | Sub_urban_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Arkiton Luxe | Apartment | Under Construction | 4285 | ACTIVE | 3 | 7500000 | 1750 | Unfurnished | True | False | False | False | False | Ahmedabad | Bopal | Ahmedabad West |
| 1 | Arkiton Luxe | Apartment | Under Construction | 4285 | ACTIVE | 3 | 7500000 | 1750 | Unfurnished | True | False | False | False | False | Ahmedabad | Bopal | Ahmedabad West |
We can see that columns 'is_ready_to_move' and 'Property_status' depicts the same thing. hence, we are dropping 'is_ready_to_move'
column.
# is_ready_to_move duplicates Property_status — drop it.
data.drop('is_ready_to_move',axis=1,inplace=True)
# Single value only (False) — no information.
data['is_commercial_Listing'].unique()
array([False])
# Constant column — drop.
data.drop('is_commercial_Listing',axis=1,inplace=True)
We can see that column 'is_commercial_Listing' has only one outcome False which simplifies that no house is commercially
listed. this column isn't relaying any valuable information.hence,we are dropping this column.
# Single value only (False) — no information.
data['is_studio'].unique()
array([False])
# Constant column — drop.
data.drop('is_studio',axis=1,inplace=True)
We can see that column 'is_studio' has only one outcome False which depicts that no property is studio. this column isn't relaying any
valuable information.hence,we are dropping this column.
# Snapshot of remaining columns and shape after the drops.
data.columns,data.shape
(Index(['Property_Name', 'Property_type', 'Property_status',
'Price_per_unit_area', 'Property_building_status', 'No_of_BHK', 'Price',
'Size', 'is_furnished', 'is_RERA_registered', 'is_PentaHouse',
'City_name', 'Locality_Name', 'Sub_urban_name'],
dtype='object'),
(4935427, 14))
# Re-check missing values after the column drops.
data.isnull().sum()
Property_Name 1711591 Property_type 0 Property_status 2895441 Price_per_unit_area 0 Property_building_status 0 No_of_BHK 0 Price 0 Size 0 is_furnished 0 is_RERA_registered 0 is_PentaHouse 0 City_name 0 Locality_Name 2 Sub_urban_name 0 dtype: int64
We are going to use Property_Name column to split our data into train and test. So for now lets work on filling
Property_status null values.
# Property_status has two categories plus NaN.
data['Property_status'].unique()
array(['Under Construction', 'Ready to move', nan], dtype=object)
# Subset of rows whose Property_status is missing, for inspection.
X=data[data["Property_status"].isnull()]
We create a 'X' variable to store data where property status is null. lets examine property type of this null data.
# Almost all null-status rows are Residential Plots.
X["Property_type"].value_counts()
Residential Plot 2895420 Apartment 18 Villa 3 Name: Property_type, dtype: int64
# Residential Plots are always 'Ready to move'.
data[data["Property_type"]=='Residential Plot'].head(2)
| Property_Name | Property_type | Property_status | Price_per_unit_area | Property_building_status | No_of_BHK | Price | Size | is_furnished | is_RERA_registered | is_PentaHouse | City_name | Locality_Name | Sub_urban_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1191 | NaN | Residential Plot | Ready to move | 13650 | UNVERIFIED | 0 | 43000000 | 3150 | Unfurnished | False | False | Ahmedabad | Juhapura | Ahmedabad West |
| 1192 | NaN | Residential Plot | Ready to move | 518 | UNVERIFIED | 0 | 700000 | 1350 | Unfurnished | False | False | Ahmedabad | Goraj | Other |
We can note that the Property_status of Residential Plots is 'Ready to move'; accordingly, let's fill
the missing Property_status values with 'Ready to move'. Prior to filling, we drop the rows where 'Property_status' is null and 'Property_type' is either Apartment or Villa (note that they are very few in number).
# Drop the handful of null-status Apartment/Villa rows (too few to impute).
i = data[data['Property_status'].isnull()
         & data['Property_type'].isin(['Apartment', 'Villa'])].index
data.drop(i, inplace=True)
# Remaining nulls are all Residential Plots, which are always 'Ready to move'.
# Assign the result instead of fillna(inplace=True): in-place fillna on a
# column is the chained-assignment pattern deprecated by pandas 2.x.
data['Property_status'] = data['Property_status'].fillna('Ready to move')
Dropping null values from Locality_Name--
# Remove the two rows with a missing locality, then re-check null counts.
data.dropna(subset=['Locality_Name'], inplace=True)
data.isnull().sum()
Property_Name 1711591 Property_type 0 Property_status 0 Price_per_unit_area 0 Property_building_status 0 No_of_BHK 0 Price 0 Size 0 is_furnished 0 is_RERA_registered 0 is_PentaHouse 0 City_name 0 Locality_Name 0 Sub_urban_name 0 dtype: int64
# Final structure check after imputation and row drops.
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 4935404 entries, 0 to 4942703 Data columns (total 14 columns): # Column Dtype --- ------ ----- 0 Property_Name object 1 Property_type object 2 Property_status object 3 Price_per_unit_area int32 4 Property_building_status object 5 No_of_BHK int32 6 Price int32 7 Size int32 8 is_furnished object 9 is_RERA_registered bool 10 is_PentaHouse bool 11 City_name object 12 Locality_Name object 13 Sub_urban_name object dtypes: bool(2), int32(4), object(8) memory usage: 423.6+ MB
Changing datatype--
# Both flags are genuinely binary.
data['is_RERA_registered'].unique(),data['is_PentaHouse'].unique()
(array([ True, False]), array([False, True]))
# Currently stored as numpy bool.
data['is_RERA_registered'].dtype,data['is_PentaHouse'].dtype
(dtype('bool'), dtype('bool'))
# Cast bool -> object ahead of the label encoding performed further below.
data['is_RERA_registered']=data['is_RERA_registered'].astype('object')
data['is_PentaHouse']=data['is_PentaHouse'].astype('object')
data['is_RERA_registered'].unique(),data['is_PentaHouse'].unique()
(array([True, False], dtype=object), array([False, True], dtype=object))
# Verify dtypes across the frame.
data.dtypes
Property_Name object Property_type object Property_status object Price_per_unit_area int32 Property_building_status object No_of_BHK int32 Price int32 Size int32 is_furnished object is_RERA_registered object is_PentaHouse object City_name object Locality_Name object Sub_urban_name object dtype: object
# Columns retained for modelling.
data.columns
Index(['Property_Name', 'Property_type', 'Property_status',
'Price_per_unit_area', 'Property_building_status', 'No_of_BHK', 'Price',
'Size', 'is_furnished', 'is_RERA_registered', 'is_PentaHouse',
'City_name', 'Locality_Name', 'Sub_urban_name'],
dtype='object')
# Preview the cleaned frame and its shape.
data.head(2),data.shape
( Property_Name Property_type Property_status Price_per_unit_area \ 0 Arkiton Luxe Apartment Under Construction 4285 1 Arkiton Luxe Apartment Under Construction 4285 Property_building_status No_of_BHK Price Size is_furnished \ 0 ACTIVE 3 7500000 1750 Unfurnished 1 ACTIVE 3 7500000 1750 Unfurnished is_RERA_registered is_PentaHouse City_name Locality_Name Sub_urban_name 0 True False Ahmedabad Bopal Ahmedabad West 1 True False Ahmedabad Bopal Ahmedabad West , (4935404, 14))
# Boxplots of the numeric columns to eyeball outliers (Price dominates the scale).
sns.boxplot(data=data.loc[:,[ 'Price_per_unit_area','No_of_BHK', 'Price','Size']])
<Axes: >
# IQR-based outlier removal on Price.
Q1 = data.Price.quantile(0.25)
Q3 = data.Price.quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
print(lower)
upper = Q3 + 1.5 * IQR
print(upper)
# Use the bounds computed above — the original re-derived them inline,
# leaving `lower`/`upper` printed but unused.
data = data[(data.Price > lower) & (data.Price < upper)]
sns.boxplot(y='Price', data=data, palette='hot')
-1661500.0 4366500.0
<Axes: ylabel='Price'>
# Frequency of each property type.
plt.xticks(rotation=90,fontsize="medium")
print(data['Property_type'].value_counts())
sns.countplot(data=data,x=data['Property_type'],palette="Set1")
plt.title("Property Types",fontweight="bold",fontsize=15)
Residential Plot 3880736 Apartment 102400 Villa 30613 Independent Floor 28903 Independent House 6180 Name: Property_type, dtype: int64
Text(0.5, 1.0, 'Property Types')
Most properties are Residential Plots, followed by Apartments.
# Ready-to-move vs under-construction counts.
print(data['Property_status'].value_counts())
sns.countplot(data=data,x=data['Property_status'],palette="Set1")
plt.title("Property Status",fontweight="bold",fontsize=15)
Ready to move 3970666 Under Construction 78166 Name: Property_status, dtype: int64
Text(0.5, 1.0, 'Property Status')
Most properties are ready to move; only a few are under construction.
# Verification status of the listings.
print(data['Property_building_status'].value_counts())
sns.countplot(data=data,x=data['Property_building_status'],palette="viridis_r")
plt.title("Property building status",fontweight="bold",fontsize=15)
UNVERIFIED 3737414 ACTIVE 311351 INACTIVE 67 Name: Property_building_status, dtype: int64
Text(0.5, 1.0, 'Property building status')
For most properties building status is unverified
# BHK distribution (0 BHK = residential plots, which dominate).
sns.displot(x='No_of_BHK',data=data,color="orange",bins=15)
print(data['No_of_BHK'].value_counts())
plt.title("Distribution plot for No of BHKs",fontweight="bold",fontsize=15)
0 3880736 2 86667 1 53609 3 20162 5 7546 4 110 8 1 6 1 Name: No_of_BHK, dtype: int64
Text(0.5, 1.0, 'Distribution plot for No of BHKs')
0 BHKs represents Residential plots thus, from above distribution we can conclude that Residential plots are highly available.
# Average price trend against BHK count.
sns.lineplot(data=data,x='No_of_BHK',y='Price',color="blue")
plt.title("No of BHKs vs Price",fontweight="bold",fontsize=15)
Text(0.5, 1.0, 'No of BHKs vs Price')
As the number of BHKs increases from 0 to 3 the overall price also rises; however, the price fluctuates beyond that point.
# Relationship between floor area and price.
sns.scatterplot(data=data,x='Size',y='Price',color="green")
plt.title("Size vs Price",fontweight="bold",fontsize=15)
plt.xlabel("Size in sq ft")
Text(0.5, 0, 'Size in sq ft')
Above scatterplot depicts that Size of property have impact on price.
# Listing counts per city.
plt.xticks(rotation=90,fontsize="medium")
sns.countplot(x='City_name',data=data)
plt.title("City wise property options",fontweight="bold",fontsize=15)
data.City_name.value_counts()
Lucknow 3510702 Bangalore 129470 Hyderabad 114286 Chennai 111825 Kolkata 70076 Mumbai 64794 Delhi 43370 Ahmedabad 4309 Name: City_name, dtype: int64
Lucknow has highest number of property options.
# Mean price per city (seaborn barplot aggregates with the mean by default).
plt.xticks(rotation=90,fontsize="medium")
sns.barplot(data=data,x='City_name',y="Price",palette="Set2")
plt.title("City wise Price",fontweight="bold",fontsize=15)
Text(0.5, 1.0, 'City wise Price')
Ahmedabad has the highest property prices, while Lucknow offers cheaper properties.
# Treemap of mean price per (city, property type).
meanprice = data.groupby(["City_name", 'Property_type'])['Price'].mean().reset_index()
# 'darkcoral' is not a valid CSS colour name — use 'coral' for Hyderabad.
fig = px.treemap(meanprice, path=["City_name", 'Property_type'], values='Price',
                 color='City_name', width=900, height=1000,
                 title="Property wise Mean Price in Cities",
                 color_discrete_map={'Chennai': 'red', 'Ahmedabad': 'darkblue',
                                     "Delhi": "black", "Bangalore": "darkred",
                                     "Kolkata": "purple", "Mumbai": "blue",
                                     'Hyderabad': "coral", "Lucknow": "green"})
# Close the bold tag properly ('</b>', not a second opening '<b>').
fig.update_layout(title='<b>City wise Mean Price of Properties</b>')
fig.update_traces(root_color="grey")
fig.show(renderer="notebook")
Independent Floors in Chennai are the most expensive, whilst Residential Plots in Mumbai are the cheapest among properties.
# RERA registration counts.
print(data['is_RERA_registered'].value_counts())
sns.countplot(x=data['is_RERA_registered'],palette="Set1")
plt.title("RERA Status",fontweight="bold",fontsize=15)
False 4017967 True 30865 Name: is_RERA_registered, dtype: int64
Text(0.5, 1.0, 'RERA Status')
Very few properties are registered under RERA.
# Penthouse flag counts.
print(data['is_PentaHouse'].value_counts())
sns.countplot(x=data['is_PentaHouse'],palette='hsv')
plt.title("PentaHouse Status",fontweight="bold",fontsize=15)
False 4048829 True 3 Name: is_PentaHouse, dtype: int64
Text(0.5, 1.0, 'PentaHouse Status')
Only 3 penthouses are available.
# Furnishing status counts.
print(data['is_furnished'].value_counts())
sns.countplot(data=data,x=data['is_furnished'],palette="hot")
plt.title("Furnished Status",fontweight="bold",fontsize=15)
Unfurnished 3991021 Semi-Furnished 43998 Furnished 13813 Name: is_furnished, dtype: int64
Text(0.5, 1.0, 'Furnished Status')
Most of the properties are Unfurnished.
# Correlation is defined only for the numeric columns — select them explicitly.
# Passing the full column list made pandas drop the non-numeric ones with the
# FutureWarning about corr's numeric_only default; the result is identical.
data[['Price_per_unit_area', 'No_of_BHK', 'Price', 'Size']].corr()
C:\Users\hp\AppData\Local\Temp\ipykernel_12772\4181590655.py:4: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
| Price_per_unit_area | No_of_BHK | Price | Size | |
|---|---|---|---|---|
| Price_per_unit_area | 1.000000 | 0.567791 | 0.830722 | -0.174036 |
| No_of_BHK | 0.567791 | 1.000000 | 0.489565 | -0.096577 |
| Price | 0.830722 | 0.489565 | 1.000000 | 0.191478 |
| Size | -0.174036 | -0.096577 | 0.191478 | 1.000000 |
plt.figure(figsize=(8, 6))
# Restrict corr() to the numeric columns so it does not emit the
# numeric_only FutureWarning; the heatmap is unchanged.
sns.heatmap(data[['Price_per_unit_area', 'No_of_BHK', 'Price', 'Size']].corr(),
            annot=True, cmap="viridis")
C:\Users\hp\AppData\Local\Temp\ipykernel_12772\1142607767.py:5: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
<Axes: >
# The two flags are plain booleans, so a cast to int gives exactly what
# LabelEncoder would (False -> 0, True -> 1) without the sklearn detour.
data['is_RERA_registered'] = data['is_RERA_registered'].astype(int)
data['is_PentaHouse'] = data['is_PentaHouse'].astype(int)
# Cardinality check: far too many categories for one-hot encoding.
data['Sub_urban_name'].nunique(),data["Locality_Name"].nunique()
(94, 3396)
Dropping Sub_urban_name and Locality_Name--
# The high-cardinality location columns would explode the dummy matrix — drop
# both in one call, preview the result, and keep a pre-encoding copy.
data.drop(columns=['Sub_urban_name', 'Locality_Name'], inplace=True)
data.head(2)
data_ = data.copy()
Computing indicator / dummy variables--
# One-hot encode the remaining categorical columns.
data=pd.get_dummies(data,columns=['Property_type','Property_status','Property_building_status','is_furnished','City_name'])
data.head(2)
| Property_Name | Price_per_unit_area | No_of_BHK | Price | Size | is_RERA_registered | is_PentaHouse | Property_type_Apartment | Property_type_Independent Floor | Property_type_Independent House | Property_type_Residential Plot | Property_type_Villa | Property_status_Ready to move | Property_status_Under Construction | Property_building_status_ACTIVE | Property_building_status_INACTIVE | Property_building_status_UNVERIFIED | is_furnished_Furnished | is_furnished_Semi-Furnished | is_furnished_Unfurnished | City_name_Ahmedabad | City_name_Bangalore | City_name_Chennai | City_name_Delhi | City_name_Hyderabad | City_name_Kolkata | City_name_Lucknow | City_name_Mumbai | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 22 | Satyam Sarjan | 2486 | 2 | 2283000 | 918 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 27 | Kailash The Willows | 2593 | 2 | 3385000 | 1305 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
from sklearn.preprocessing import StandardScaler
# One scaler for the features, a separate one for the target so Price
# predictions can be inverse-transformed later. Fit the feature scaler on all
# three columns at once: refitting it per column (as before) left it holding
# statistics for only the last column scaled.
# NOTE(review): scaling is applied before the train/test split below, which
# leaks test-set statistics into training — consider fitting on train only.
sc = StandardScaler()
sc1 = StandardScaler()
feature_cols = ['No_of_BHK', 'Price_per_unit_area', 'Size']
data[feature_cols] = sc.fit_transform(data[feature_cols])
data['Price'] = sc1.fit_transform(data[['Price']])
data.head(2)
data.head(2)
| Property_Name | Price_per_unit_area | No_of_BHK | Price | Size | is_RERA_registered | is_PentaHouse | Property_type_Apartment | Property_type_Independent Floor | Property_type_Independent House | Property_type_Residential Plot | Property_type_Villa | Property_status_Ready to move | Property_status_Under Construction | Property_building_status_ACTIVE | Property_building_status_INACTIVE | Property_building_status_UNVERIFIED | is_furnished_Furnished | is_furnished_Semi-Furnished | is_furnished_Unfurnished | City_name_Ahmedabad | City_name_Bangalore | City_name_Chennai | City_name_Delhi | City_name_Hyderabad | City_name_Kolkata | City_name_Lucknow | City_name_Mumbai | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 22 | Satyam Sarjan | 1.578174 | 4.471819 | 1.643631 | -0.456507 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 27 | Kailash The Willows | 1.691799 | 4.471819 | 3.071929 | 0.711509 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
print("Splitting data into train and test--")
# Rows with a missing Property_Name are tagged with the sentinel 'T'
# and become the "test" (to-be-predicted) partition; named rows train.
# NOTE(review): a real property literally named 'T' would be mis-routed
# to the test set -- confirm no such name exists.
# Direct column assignment (not inplace fillna on a slice) avoids the
# pandas chained-assignment warning and works under copy-on-write.
data["Property_Name"] = data["Property_Name"].fillna('T')
train = data[data["Property_Name"] != 'T']
test = data[data["Property_Name"] == 'T']
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
# Features: every column except Property_Name (col 0) and Price (col 3).
X_train = pd.concat([train.iloc[:, 1:3], train.iloc[:, 4:29]], axis=1)
y_train = train.iloc[:, 3]
X_test = pd.concat([test.iloc[:, 1:3], test.iloc[:, 4:29]], axis=1)
y_test = test.iloc[:, 3]
# Original lines ended with a stray trailing comma after print(...),
# which silently built throwaway tuples -- removed.
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
Splitting data into train and test-- (2613771, 26) (2613771,) (1435061, 26) (1435061,)
print("Lets build the Multiple Linear regression model")
def modelling(X_train, y_train, X_test):
    """Fit a LinearRegression on the training split and return it.

    X_test is accepted for signature parity with the later modelling
    helpers but is not used here.
    """
    estimator = LinearRegression()
    estimator.fit(X_train, y_train)
    print("ModelL training is completed--")
    return estimator
print("Calling the modelling function--")
modelL_train = modelling(X_train, y_train, X_test)
def prediction():
    """Predict scaled prices for the held-out rows with the fitted model."""
    return modelL_train.predict(X_test)
print("Calling prediction function--")
predL = prediction()
print(predL)
# Report accuracy as a percentage r2 plus RMSE in scaled-price units.
r2score_MLR = round(r2_score(y_test, predL) * 100, 2)
rmse = m.sqrt(mean_squared_error(y_test, predL))
print("Multiple Linear Regression--")
print('r2score:', r2score_MLR)
print('RMSE:', rmse)
print('************************************************')
Lets build the Multiple Linear regression model Calling the modelling function-- ModelL training is completed-- Calling prediction function-- [8.57646227 1.14950848 1.99299574 ... 1.57563066 1.57563066 1.57563066] Multiple Linear Regression-- r2score: 86.76 RMSE: 0.3838977120496582 ************************************************
# Put actuals and MLR predictions side by side (aligned on the test index).
actual_scaled = pd.Series(data=y_test, index=test.index)
pred_scaled = pd.Series(data=predL, index=test.index)
scaled = pd.concat([actual_scaled, pred_scaled], axis=1)
scaled.columns = ["actual_scaled", "pred_scaled"]
print("Inverse transform scaled values--")
# sc1 was fitted on a single feature (Price), so inverse-transform each
# column separately: passing the whole 2-column frame raises a
# feature-count mismatch in modern scikit-learn (it only worked before
# via silent NumPy broadcasting).
combined = np.column_stack(
    [sc1.inverse_transform(scaled[[col]].values).ravel() for col in scaled.columns]
)
df = pd.DataFrame(combined, columns=['actual', 'predL'])
print(df)
# 'seaborn-whitegrid' was deprecated in matplotlib 3.6 and removed in
# 3.8; 'seaborn-v0_8-whitegrid' is the supported alias.
plt.style.use('seaborn-v0_8-whitegrid')
df.plot(figsize=(18, 6), colormap="viridis")
plt.legend(loc='best', bbox_to_anchor=(1, 1), labelspacing=1, fontsize=20)
plt.title("Actual vs Predicted Property Prices-- Multiple Linear Regression Model", fontsize=22)
plt.ylabel("Price", fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.show()
Inverse transform scaled values--
actual predL
0 4300000.0 7.632011e+06
1 2700000.0 1.901761e+06
2 2500000.0 2.552551e+06
3 2200000.0 1.996435e+06
4 4200000.0 5.341510e+06
... ... ...
1435056 783650.0 7.276011e+05
1435057 1700000.0 2.230534e+06
1435058 1700000.0 2.230534e+06
1435059 1700000.0 2.230534e+06
1435060 1700000.0 2.230534e+06
[1435061 rows x 2 columns]
C:\Users\hp\AppData\Local\Temp\ipykernel_12772\3978307340.py:10: MatplotlibDeprecationWarning: The seaborn styles shipped by Matplotlib are deprecated since 3.6, as they no longer correspond to the styles shipped by seaborn. However, they will remain available as 'seaborn-v0_8-<style>'. Alternatively, directly use the seaborn API instead.
print("Lets build the Ridge regression model")
from sklearn.linear_model import Ridge
def modelling1(X_train, y_train, X_test):
    """Fit a default Ridge regressor and return the fitted model.

    X_test is unused; kept for signature parity with modelling().
    """
    ridge = Ridge()
    ridge.fit(X_train, y_train)
    print("Model1 training is completed--")
    return ridge
print("Calling the modelling1 function--")
model1_train = modelling1(X_train, y_train, X_test)
def prediction():
    """Predict scaled prices for the held-out rows with the Ridge model."""
    return model1_train.predict(X_test)
print("Calling prediction function--")
pred1 = prediction()
print(pred1)
r2score_Ridge = round(r2_score(y_test, pred1) * 100, 2)
rmse = m.sqrt(mean_squared_error(y_test, pred1))
print("Ridge Regression--")
print('r2score:', r2score_Ridge)
print('RMSE:', rmse)
print('***********************************************')
Lets build the Ridge regression model Calling the modelling1 function-- Model1 training is completed-- Calling prediction function-- [8.57746535 1.15553819 1.99790626 ... 1.57330409 1.57330409 1.57330409] Ridge Regression-- r2score: 86.76 RMSE: 0.38388433185020177 ***********************************************
print("Lets build the Decision Tree Regression model")
from sklearn.tree import DecisionTreeRegressor
def modelling2():
    """Fit a DecisionTreeRegressor (squared-error split criterion)."""
    tree = DecisionTreeRegressor(criterion='squared_error')
    tree.fit(X_train, y_train)
    print("Model training is completed.")
    return tree
print("Calling modelling2 function--")
model2_train = modelling2()
def prediction():
    """Predict scaled prices for the held-out rows with the tree model."""
    return model2_train.predict(X_test)
print("Calling prediction function--")
pred2 = prediction()
print(pred2)
r2score_DT = round(r2_score(y_test, pred2) * 100, 2)
rmse = m.sqrt(mean_squared_error(y_test, pred2))
print("Decision Tree Regression--")
print('r2score:', r2score_DT)
print('RMSE:', rmse)
print('***********************************************')
Lets build the Decision Tree Regression model Calling modelling2 function-- Model training is completed. Calling prediction function-- [4.25785659 2.31371278 1.92067819 ... 0.91392924 0.91392924 0.91392924] Decision Tree Regression-- r2score: 99.96 RMSE: 0.021152771428600073 ***********************************************
# Put actuals and decision-tree predictions side by side.
actual_scaled2 = pd.Series(data=y_test, index=test.index)
pred_scaled2 = pd.Series(data=pred2, index=test.index)
scaled2 = pd.concat([actual_scaled2, pred_scaled2], axis=1)
scaled2.columns = ["actual_scaled2", "pred_scaled2"]
print("Inverse transform scaled values--")
# sc1 was fitted on a single feature (Price); inverse-transform each
# column on its own -- passing the 2-column frame raises a feature-count
# mismatch in modern scikit-learn.
combined2 = np.column_stack(
    [sc1.inverse_transform(scaled2[[col]].values).ravel() for col in scaled2.columns]
)
df2 = pd.DataFrame(combined2, columns=['actual2', 'pred2'])
print(df2)
# 'seaborn-whitegrid' was removed in matplotlib >= 3.8; use the
# supported 'seaborn-v0_8-whitegrid' alias instead.
plt.style.use('seaborn-v0_8-whitegrid')
df2.plot(figsize=(18, 6), colormap="viridis")
plt.legend(loc='best', bbox_to_anchor=(1, 1), labelspacing=1, fontsize=20)
plt.title("Actual vs Predicted Property Prices--Decision Tree Regression model", fontsize=22)
plt.ylabel("Price", fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.show()
Inverse transform scaled values--
actual2 pred2
0 4300000.0 4300000.0
1 2700000.0 2800000.0
2 2500000.0 2496755.0
3 2200000.0 2200000.0
4 4200000.0 4200000.0
... ... ...
1435056 783650.0 760122.0
1435057 1700000.0 1720000.0
1435058 1700000.0 1720000.0
1435059 1700000.0 1720000.0
1435060 1700000.0 1720000.0
[1435061 rows x 2 columns]
C:\Users\hp\AppData\Local\Temp\ipykernel_12772\1239967356.py:10: MatplotlibDeprecationWarning: The seaborn styles shipped by Matplotlib are deprecated since 3.6, as they no longer correspond to the styles shipped by seaborn. However, they will remain available as 'seaborn-v0_8-<style>'. Alternatively, directly use the seaborn API instead.
print("Lets build the Random Forest Regression model")
from sklearn.ensemble import RandomForestRegressor
def modelling3():
    """Fit a RandomForestRegressor (squared-error split criterion)."""
    forest = RandomForestRegressor(criterion="squared_error")
    forest.fit(X_train, y_train)
    print("Model training is completed.")
    return forest
print("Calling modelling3 function--")
model3_train = modelling3()
def prediction():
    """Predict scaled prices for the held-out rows with the forest model."""
    return model3_train.predict(X_test)
print("Calling prediction function--")
pred3 = prediction()
print(pred3)
r2score_RF = round(r2_score(y_test, pred3) * 100, 2)
rmse = m.sqrt(mean_squared_error(y_test, pred3))
print("Random Forest Regression--")
print('r2score:', r2score_RF)
print('RMSE:', rmse)
print('***********************************************')
Lets build the Random Forest Regression model Calling modelling3 function-- Model training is completed. Calling prediction function-- [4.2277075 2.19638743 1.92341198 ... 0.87336142 0.87336142 0.87336142] Random Forest Regression-- r2score: 99.99 RMSE: 0.012626570432088069 ***********************************************
import sqlalchemy as sa
import mysql.connector as sql
# Build a SQLAlchemy URL object for the local MySQL "capstone" database.
# NOTE(review): credentials are hard-coded in source -- move them to an
# environment variable or config file before sharing this notebook.
# NOTE(review): drivername here is plain "mysql", but the engine created
# later in the file uses "mysql+pymysql" -- confirm which driver is
# actually intended; this URL object is only printed, never connected.
mydb = sa.engine.URL.create(
    drivername="mysql",
    host = "localhost",
    username = "root",
    password = "Fuchka@104",
    database = "capstone"
)
print(mydb)
mysql://root:Fuchka%40104@localhost/capstone
# Assemble a small comparison table of each model's r2 score (%).
model_names = ['Multiple Linear Regression', 'Ridge Regression', 'Decision Tree Regression', 'Random Forest']
accuracies = [r2score_MLR, r2score_Ridge, r2score_DT, r2score_RF]
details = {
    'Model': model_names,
    'Accuracy %': accuracies,
}
df = pd.DataFrame(details)
df
| Model | Accuracy % | |
|---|---|---|
| 0 | Multiple Linear Regression | 86.76 |
| 1 | Ridge Regression | 86.76 |
| 2 | Decision Tree Regression | 99.96 |
| 3 | Random Forest | 99.99 |
1. Random Forest gives the highest accuracy.
2. To select the optimum model for deployment, we choose the property prices predicted by the Multiple Linear Regression model.
import joblib
Saving the Model Using Joblib
# Persist the trained linear model under both artefact names.
for artefact in ("Makaan_Linear_Model.pkl", "Makaan_Linear_Model.joblib"):
    joblib.dump(modelL_train, artefact)
['Makaan_Linear_Model.joblib']
Loading the Saved Model Using Joblib
# Reload the persisted linear model and re-run inference on the held-out
# rows as a round-trip sanity check (output matches predL above).
reg= joblib.load("Makaan_Linear_Model.joblib")
predictions = reg.predict(X_test)
predictions
array([8.57646227, 1.14950848, 1.99299574, ..., 1.57563066, 1.57563066,
1.57563066])
# Rebuild the unscaled copy's "missing name" partition and attach the
# linear-model predictions. Assign the column directly and take an
# explicit .copy() of the filtered frame -- this removes the
# SettingWithCopyWarning the original triggered on both assignments.
data_["Property_Name"] = data_["Property_Name"].fillna('T')
test_ = data_[data_["Property_Name"] == 'T'].copy()
test_["Pred_Price"] = predL
# inverse_transform back to real prices (sc1 was fitted on Price only).
test_["Pred_Price"] = sc1.inverse_transform(test_["Pred_Price"].values.reshape(-1, 1)).ravel()
from sqlalchemy import create_engine
# NOTE(review): credentials are embedded in the URL ('%40' is the
# URL-encoded '@' in the password) -- externalize before sharing.
engine = create_engine("mysql+pymysql://root:Fuchka%40104@localhost/capstone")
# Context manager guarantees the connection is closed even if to_sql
# raises; the original left `con` open (resource leak).
with engine.connect() as con:
    test_.to_sql(con=con, name="makaan_pred_prices", if_exists="replace")
C:\Users\hp\AppData\Local\Temp\ipykernel_12772\1285901267.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy C:\Users\hp\AppData\Local\Temp\ipykernel_12772\1285901267.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
1435061